Load Packages
library(here)
## here() starts at C:/Users/dinap/Desktop/Research/sustainable_communities
library(mclust)
## Package 'mclust' version 5.4.7
## Type 'citation("mclust")' for citing this R package in publications.
library(raster)
## Loading required package: sp
library(rgdal)
## rgdal: version: 1.5-18, (SVN revision 1082)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.0.4, released 2020/01/28
## Path to GDAL shared files: C:/Users/dinap/Documents/R/win-library/4.0/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ runtime: Rel. 6.3.1, February 10th, 2020, [PJ_VERSION: 631]
## Path to PROJ shared files: C:/Users/dinap/Documents/R/win-library/4.0/rgdal/proj
## Linking to sp version:1.4-4
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
library(sp)
library(RColorBrewer)
library(knitr)
Import Data
#import raw data
dat_raw <- read.csv(here("data/CommunityData-raw-2015.csv"))
#select columns of interest, removed co2_hhs
dat <- dat_raw[,c(4:12,14:21)]
#convert to dataframe
dat <- as.data.frame(unclass(dat))
#set rownames to city names
rownames(dat)=dat_raw$ME
#look at data
summary(dat)
## AQI_Good Bachelor_Over_25 Per_Poverty Gini
## Min. :0.01701 Min. : 8.40 Min. : 5.10 Min. :36.49
## 1st Qu.:0.75251 1st Qu.:17.00 1st Qu.:13.00 1st Qu.:43.15
## Median :0.79131 Median :21.40 Median :15.90 Median :45.13
## Mean :0.78634 Mean :23.37 Mean :16.97 Mean :45.25
## 3rd Qu.:0.88055 3rd Qu.:28.20 3rd Qu.:19.30 3rd Qu.:47.06
## Max. :0.99945 Max. :65.50 Max. :64.10 Max. :59.00
## NA's :17
## non_migration Per_Sev_Hous Xstreamlengthimpaired Per_Avg_Land_Cov
## Min. :60.40 Min. : 7.565 Min. : 0.3649 Min. : 0.00
## 1st Qu.:82.50 1st Qu.:12.978 1st Qu.: 8.0551 1st Qu.:25.60
## Median :85.20 Median :15.059 Median :12.1582 Median :50.55
## Mean :84.67 Mean :15.667 Mean :18.9924 Mean :46.10
## 3rd Qu.:87.50 3rd Qu.:17.506 3rd Qu.:22.2555 3rd Qu.:63.10
## Max. :95.20 Max. :32.368 Max. :90.4849 Max. :96.82
## NA's :12 NA's :28 NA's :36 NA's :28
## poor_health_percent Z_Water_Index Index_Black Index_Asian
## Min. : 5.20 Min. :0.00000 Min. :0.1267 Min. :0.0000
## 1st Qu.:13.30 1st Qu.:0.02821 1st Qu.:0.4130 1st Qu.:0.4109
## Median :16.18 Median :0.06296 Median :0.4837 Median :0.4822
## Mean :16.79 Mean :0.09666 Mean :0.4860 Mean :0.4861
## 3rd Qu.:19.80 3rd Qu.:0.12259 3rd Qu.:0.5592 3rd Qu.:0.5598
## Max. :38.50 Max. :0.60450 Max. :0.8114 Max. :0.8697
## NA's :45 NA's :37 NA's :28 NA's :28
## Index_Latino GHG_Percap UNEMPLOY FOOD_INS
## Min. :0.04181 Min. : 4.276 Min. :0.0203 Min. : 6.00
## 1st Qu.:0.28938 1st Qu.: 12.849 1st Qu.:0.0579 1st Qu.:11.60
## Median :0.36608 Median : 16.406 Median :0.0717 Median :13.50
## Mean :0.36338 Mean : 18.690 Mean :0.0759 Mean :13.95
## 3rd Qu.:0.42962 3rd Qu.: 21.099 3rd Qu.:0.0881 3rd Qu.:15.60
## Max. :0.92438 Max. :166.428 Max. :0.3562 Max. :31.20
## NA's :28 NA's :30 NA's :28
## VIO_CRIME
## Min. : 0.0
## 1st Qu.: 176.6
## Median : 278.7
## Mean : 308.4
## 3rd Qu.: 405.0
## Max. :1238.4
## NA's :29
head(dat)
## AQI_Good Bachelor_Over_25 Per_Poverty Gini
## Adjuntas, PR 0.9330234 18.1 64.1 57.04
## Aguadilla-Isabela, PR 0.9330234 19.7 53.1 51.94
## Alexander City, AL NA 18.2 21.2 45.36
## Arecibo, PR 0.9330234 22.0 48.3 51.85
## Atmore, AL NA 12.1 23.8 48.44
## Bonham, TX NA 16.2 14.8 43.55
## non_migration Per_Sev_Hous Xstreamlengthimpaired
## Adjuntas, PR NA NA NA
## Aguadilla-Isabela, PR NA NA NA
## Alexander City, AL 90.9 NA NA
## Arecibo, PR NA NA NA
## Atmore, AL 93.7 NA NA
## Bonham, TX 80.7 NA NA
## Per_Avg_Land_Cov poor_health_percent Z_Water_Index
## Adjuntas, PR NA NA NA
## Aguadilla-Isabela, PR NA NA NA
## Alexander City, AL NA NA NA
## Arecibo, PR NA NA NA
## Atmore, AL NA NA NA
## Bonham, TX NA NA NA
## Index_Black Index_Asian Index_Latino GHG_Percap UNEMPLOY
## Adjuntas, PR NA NA NA NA 0.3562
## Aguadilla-Isabela, PR NA NA NA NA 0.2375
## Alexander City, AL NA NA NA NA 0.0980
## Arecibo, PR NA NA NA NA 0.1649
## Atmore, AL NA NA NA NA 0.1524
## Bonham, TX NA NA NA NA 0.0641
## FOOD_INS VIO_CRIME
## Adjuntas, PR NA NA
## Aguadilla-Isabela, PR NA NA
## Alexander City, AL NA NA
## Arecibo, PR NA NA
## Atmore, AL NA NA
## Bonham, TX NA NA
# remove any records that have NAs
dat = na.omit(dat)
# convert to z-scores
dat <- as.matrix(scale(dat, center = TRUE, scale = TRUE))
Run Cluster GMM
#run mclust on 1 to 20 clusters
dat_mc <- Mclust(dat, G=c(1:20))
#print summary of model fit
summary(dat_mc)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 10 components:
##
## log-likelihood n df BIC ICL
## -16206.34 886 485 -35704.23 -35902.49
##
## Clustering table:
## 1 2 3 4 5 6 7 8 9 10
## 50 59 53 62 150 129 51 66 142 124
Model comparison
#plot BIC scores of different models
plot(dat_mc, what='BIC')

#zoom in on best-fitting models
plot(dat_mc, what='BIC', ylim=c(-37000,-35000))

#model type VVE chosen as best, with similar BIC scores for 9-12 clusters
#to do additional model comparisons, generate and compare VVE models with 9-12 clusters
dat_mc9 <- Mclust(dat, G=9, modelNames="VVE")
dat_mc10 <- Mclust(dat, G=10, modelNames="VVE")
dat_mc11 <- Mclust(dat, G=11, modelNames="VVE")
dat_mc12 <- Mclust(dat, G=12, modelNames="VVE")
#extract BIC scores
BICs<-c(dat_mc9$BIC[1],dat_mc10$BIC[1],dat_mc11$BIC[1],dat_mc12$BIC[1])
#plot BIC scores
plot(c(9:12),BICs, pch=16, ylab="BIC Score",xlab="Number of Clusters", type='o')

#calculate change in BIC score, since in this method the goal to to maximize BIC
#we calculate change from highest scoring model
delta_BIC <- max(BICs) - BICs
#calculate BIC weights
w_BIC <- round(exp(-0.5*delta_BIC)/sum(exp(-0.5*delta_BIC)), digits=3)
#make table of results
results_table <- cbind.data.frame(clusters=c(9:12),BIC=BICs,delta_BIC,weight=w_BIC)
#order by delta
results_table <- results_table[order(delta_BIC),]
rownames(results_table)<-NULL
#print table
kable(results_table[1:4], caption = "Model Comparison")
Model Comparison
| 10 |
-35704.23 |
0.00000 |
1 |
| 12 |
-35731.46 |
27.23133 |
0 |
| 11 |
-35811.06 |
106.82569 |
0 |
| 9 |
-35818.70 |
114.46901 |
0 |
Extract and map cluster classifications
#extract classifications, e.g., which MSA belongs to which cluster, write to CSV
write.csv(dat_mc10$classification,file=here("results/cluster_assignment.csv"))
data<-read.csv("results/cluster_assignment.csv")
#import MSA boundaries shapefile
msa_Boundary <-readOGR(here("data/MSA"),"tl_2015_us_cbsa")
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\dinap\Desktop\Research\sustainable_communities\data\MSA", layer: "tl_2015_us_cbsa"
## with 929 features
## It has 12 fields
## Integer64 fields read as strings: ALAND AWATER
#merge polygons with cluster IDs
merged <- merge(msa_Boundary,data,by.x="NAME",by.y="X")
#remove any cases MSAs with no cluster assignments
merged_clean <- merged[!is.na(merged$x),]
#convert cluster ID to factor
merged_clean$x_f <- as.factor(merged_clean$x)
#plot
spplot(merged_clean, "x_f", col.regions=brewer.pal(10,'Spectral'))

Examine how clusters score in different metrics
#new data frame
dat_clust <- as.data.frame(dat)
#add names
dat_clust$ME <- rownames(dat)
rownames(dat_clust) <- NULL
#merge in cluster assignments
dat_clust <- merge(dat_clust,data,by.x="ME",by.y="X")
#calculate cluster means for different variables
c1m <- data.frame(one=colMeans(subset(dat_clust, x=="1")[2:17]))
c2m <- data.frame(two=colMeans(subset(dat_clust, x=="2")[2:17]))
c3m <- data.frame(three=colMeans(subset(dat_clust, x=="3")[2:17]))
c4m <- data.frame(four=colMeans(subset(dat_clust, x=="4")[2:17]))
c5m <- data.frame(five=colMeans(subset(dat_clust, x=="5")[2:17]))
c6m <- data.frame(six=colMeans(subset(dat_clust, x=="6")[2:17]))
c7m <- data.frame(seven=colMeans(subset(dat_clust, x=="7")[2:17]))
c8m <- data.frame(eight=colMeans(subset(dat_clust, x=="8")[2:17]))
c9m <- data.frame(nine=colMeans(subset(dat_clust, x=="9")[2:17]))
c10m <- data.frame(ten=colMeans(subset(dat_clust, x=="10")[2:17]))
cluster_means <- cbind.data.frame(c1m,c2m,c3m,c4m,c5m,c6m,c7m,c8m, c9m, c10m)
cluster_means$variable <- row.names(cluster_means)
rownames(cluster_means) <- NULL
#boxplots of scores for clusters
par(mfrow=c(2,5), mar=c(2.5,5,2.5,3))
boxplot(subset(dat_clust, x=="1")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 1")
boxplot(subset(dat_clust, x=="2")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 2")
boxplot(subset(dat_clust, x=="3")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 3")
boxplot(subset(dat_clust, x=="4")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 4")
boxplot(subset(dat_clust, x=="5")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 5")
boxplot(subset(dat_clust, x=="6")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 6")
boxplot(subset(dat_clust, x=="7")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 7")
boxplot(subset(dat_clust, x=="8")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 8")
boxplot(subset(dat_clust, x=="9")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 9")
boxplot(subset(dat_clust, x=="10")[2:17], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 10")

par(mfrow=c(1,1))
#dotplots of cluster mean values
par(mfrow=c(2,5))
dotchart(cluster_means$one, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 1', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$two, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 2', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$three, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 3', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$four, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 4', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$five, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 5', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$six, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 6', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$seven, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 7', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$eight, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 8', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$nine, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 9', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$ten, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 10', pch=16)
abline(v=0, lwd=2)

par(mfrow=c(1,1))
Dimension reduction
#run discriminant analysis on clusters
DR <- MclustDR(dat_mc10, lambda = 1) #setting lambda to 1 gives most separating directions
summary(DR)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: Mclust (VVE, 10)
##
## Clusters n
## 1 50
## 2 59
## 3 53
## 4 62
## 5 150
## 6 129
## 7 51
## 8 66
## 9 142
## 10 124
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3 Dir4 Dir5
## AQI_Good -0.176001 0.0167446 0.2544551 -0.0041113 -0.199672
## Bachelor_Over_25 0.072840 -0.2897267 0.3135171 -0.5609009 0.421081
## Per_Poverty 0.022156 0.2216694 0.6106306 -0.1721047 0.485515
## Gini -0.018424 -0.1870715 -0.0774274 -0.0713467 -0.052922
## non_migration 0.122296 0.0306189 -0.1004147 0.4486234 0.108786
## Per_Sev_Hous -0.096523 -0.2903113 -0.5171334 -0.2399188 0.140611
## Xstreamlengthimpaired 0.474859 0.2044703 -0.1971267 0.1582116 0.240913
## Per_Avg_Land_Cov -0.253104 -0.3671406 -0.0276019 0.2069777 -0.338410
## poor_health_percent -0.160339 -0.3429140 -0.0287426 0.2401105 0.097547
## Z_Water_Index -0.751602 0.6152728 -0.1207784 0.0101883 0.270779
## Index_Black 0.012783 0.0364651 -0.1388193 -0.0924190 0.063046
## Index_Asian 0.030941 -0.1078755 0.1763719 0.0492666 0.086927
## Index_Latino 0.053899 0.0566457 -0.0075445 0.2151389 -0.028245
## GHG_Percap -0.134751 -0.0033918 0.1659712 0.0513501 0.460770
## UNEMPLOY 0.016303 -0.0286237 -0.2089966 0.0738562 0.156282
## FOOD_INS 0.113812 -0.2359471 0.0577083 0.3324095 -0.089733
## VIO_CRIME -0.151807 -0.0315984 -0.0236525 0.3006170 0.013413
## Dir6 Dir7 Dir8 Dir9
## AQI_Good 0.247547 0.396043 0.0874908 -0.0832171
## Bachelor_Over_25 -0.170414 -0.031725 -0.3655544 0.0382844
## Per_Poverty -0.064981 0.201735 -0.6034751 -0.4234419
## Gini -0.079564 -0.351825 0.2765681 -0.1192998
## non_migration 0.292492 -0.165084 -0.1740190 -0.2571548
## Per_Sev_Hous 0.351465 0.364975 0.3325524 -0.2378273
## Xstreamlengthimpaired -0.438677 0.568038 -0.0946994 -0.0208607
## Per_Avg_Land_Cov -0.188715 0.293543 -0.4090132 0.0051191
## poor_health_percent -0.296444 0.183355 0.1365397 0.1702002
## Z_Water_Index -0.218556 0.071558 -0.0198185 0.0456237
## Index_Black 0.105431 -0.086784 -0.1339241 0.4102571
## Index_Asian -0.134440 -0.048924 0.0340747 -0.1862869
## Index_Latino 0.026287 -0.080645 -0.1474826 -0.1506777
## GHG_Percap 0.322789 0.095990 -0.0615401 0.1927214
## UNEMPLOY 0.388493 0.056520 -0.0461231 0.2111648
## FOOD_INS -0.116806 -0.038561 0.1871896 0.5775028
## VIO_CRIME -0.166452 -0.198262 0.0084581 -0.0539742
##
## Dir1 Dir2 Dir3 Dir4 Dir5 Dir6 Dir7
## Eigenvalues 0.94731 0.67751 0.50022 0.43587 0.32617 0.18446 0.13214
## Cum. % 28.77575 49.35584 64.55074 77.79092 87.69873 93.30189 97.31587
## Dir8 Dir9
## Eigenvalues 0.06914 0.019223
## Cum. % 99.41608 100.000000
#plot eigenvalues
plot(c(1:17),DR$evalues,ylab="eigenvalues", xlab="component", type="o", pch=16)

#plot all pairs of combinations
plot(DR, what='pairs',symbols=c("1","2","3","4","5","6","7","8","9","10"),
colors=c("red","blue","green","goldenrod2","violet","brown","black","grey30","slateblue2","seagreen3"))

#plot directions for variables
Directions <- data.frame(d1=DR$basis[,1],d2=DR$basis[,2],
d3=DR$basis[,3],d4=DR$basis[,4],
d5=DR$basis[,5],d6=DR$basis[,6],
d7=DR$basis[,7],d8=DR$basis[,8],
d9=DR$basis[,9],var=rownames(DR$basis))
par(mfrow=c(3,3))
dotchart(Directions$d1, labels=Directions$var, pch=16, main="Dir1", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d2, labels=Directions$var, pch=16, main="Dir2", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d3, labels=Directions$var, pch=16, main="Dir3", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d4, labels=Directions$var, pch=16, main="Dir4", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d5, labels=Directions$var, pch=16, main="Dir5", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d6, labels=Directions$var, pch=16, main="Dir6", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d7, labels=Directions$var, pch=16, main="Dir7", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d8, labels=Directions$var, pch=16, main="Dir8", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)
dotchart(Directions$d9, labels=Directions$var, pch=16, main="Dir9", xlim=c(-0.6,0.6))
abline(v=0, lwd=2)
abline(v=c(0.1,-0.1),lty=2)

par(mfrow=c(1,1))